# Core data stack.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
plt.style.use('fivethirtyeight')
# NOTE: applied second, dark_background overrides most fivethirtyeight settings.
plt.style.use('dark_background')
import seaborn as sns
import plotly.express as px
from sklearn.model_selection import train_test_split
# NOTE(review): plot_roc_curve was deprecated in scikit-learn 1.0 and removed
# in 1.2 (replaced by RocCurveDisplay) — this import fails on modern scikit-learn.
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, plot_roc_curve
from sklearn.preprocessing import RobustScaler
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.utils import shuffle
from tqdm import tqdm_notebook
# Candidate classifiers compared below.
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBClassifier
from sklearn.ensemble import GradientBoostingClassifier
import tensorflow as tf
import warnings
# Silences ALL warnings, including the deprecation notices for distplot,
# tqdm_notebook and plot_roc_curve used below.
warnings.filterwarnings('ignore')
from luciferml.preprocessing import Preprocess as prep
# Load the water-potability dataset (expects water_potability.csv in the working directory).
data = pd.read_csv('water_potability.csv')
# Preview the first five rows to sanity-check column names and values.
data.head()
| ph | Hardness | Solids | Chloramines | Sulfate | Conductivity | Organic_carbon | Trihalomethanes | Turbidity | Potability | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | NaN | 204.890455 | 20791.318981 | 7.300212 | 368.516441 | 564.308654 | 10.379783 | 86.990970 | 2.963135 | 0 |
| 1 | 3.716080 | 129.422921 | 18630.057858 | 6.635246 | NaN | 592.885359 | 15.180013 | 56.329076 | 4.500656 | 0 |
| 2 | 8.099124 | 224.236259 | 19909.541732 | 9.275884 | NaN | 418.606213 | 16.868637 | 66.420093 | 3.055934 | 0 |
| 3 | 8.316766 | 214.373394 | 22018.417441 | 8.059332 | 356.886136 | 363.266516 | 18.436524 | 100.341674 | 4.628771 | 0 |
| 4 | 9.092223 | 181.101509 | 17978.986339 | 6.546600 | 310.135738 | 398.410813 | 11.558279 | 31.997993 | 4.075075 | 0 |
data.info()  # dtypes and non-null counts: ph, Sulfate and Trihalomethanes contain missing values
<class 'pandas.core.frame.DataFrame'> RangeIndex: 3276 entries, 0 to 3275 Data columns (total 10 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 ph 2785 non-null float64 1 Hardness 3276 non-null float64 2 Solids 3276 non-null float64 3 Chloramines 3276 non-null float64 4 Sulfate 2495 non-null float64 5 Conductivity 3276 non-null float64 6 Organic_carbon 3276 non-null float64 7 Trihalomethanes 3114 non-null float64 8 Turbidity 3276 non-null float64 9 Potability 3276 non-null int64 dtypes: float64(9), int64(1) memory usage: 256.1 KB
data.describe()  # summary statistics for every numeric column
| ph | Hardness | Solids | Chloramines | Sulfate | Conductivity | Organic_carbon | Trihalomethanes | Turbidity | Potability | |
|---|---|---|---|---|---|---|---|---|---|---|
| count | 2785.000000 | 3276.000000 | 3276.000000 | 3276.000000 | 2495.000000 | 3276.000000 | 3276.000000 | 3114.000000 | 3276.000000 | 3276.000000 |
| mean | 7.080795 | 196.369496 | 22014.092526 | 7.122277 | 333.775777 | 426.205111 | 14.284970 | 66.396293 | 3.966786 | 0.390110 |
| std | 1.594320 | 32.879761 | 8768.570828 | 1.583085 | 41.416840 | 80.824064 | 3.308162 | 16.175008 | 0.780382 | 0.487849 |
| min | 0.000000 | 47.432000 | 320.942611 | 0.352000 | 129.000000 | 181.483754 | 2.200000 | 0.738000 | 1.450000 | 0.000000 |
| 25% | 6.093092 | 176.850538 | 15666.690297 | 6.127421 | 307.699498 | 365.734414 | 12.065801 | 55.844536 | 3.439711 | 0.000000 |
| 50% | 7.036752 | 196.967627 | 20927.833607 | 7.130299 | 333.073546 | 421.884968 | 14.218338 | 66.622485 | 3.955028 | 0.000000 |
| 75% | 8.062066 | 216.667456 | 27332.762127 | 8.114887 | 359.950170 | 481.792304 | 16.557652 | 77.337473 | 4.500320 | 1.000000 |
| max | 14.000000 | 323.124000 | 61227.196008 | 13.127000 | 481.030642 | 753.342620 | 28.300000 | 124.000000 | 6.739000 | 1.000000 |
data.shape  # (rows, columns)
(3276, 10)
data.isnull().sum()  # missing-value count per column
ph 491 Hardness 0 Solids 0 Chloramines 0 Sulfate 781 Conductivity 0 Organic_carbon 0 Trihalomethanes 162 Turbidity 0 Potability 0 dtype: int64
# Drop every row containing at least one NaN.
# NOTE(review): this removes ~39% of the rows (3276 -> 2011, per the outputs
# below); median/mean imputation would retain more data — confirm this
# trade-off is intended.
data = data.dropna()
data.isnull().sum()  # verify no missing values remain
ph 0 Hardness 0 Solids 0 Chloramines 0 Sulfate 0 Conductivity 0 Organic_carbon 0 Trihalomethanes 0 Turbidity 0 Potability 0 dtype: int64
# Class balance of the target variable.
plt.figure(figsize=(10, 6))  # setting the figure size
# Pass the column by name: positional-Series usage of countplot was
# deprecated in seaborn 0.12 and errors on newer releases.
sns.countplot(x='Potability', data=data, palette='rocket')  # checking the class count of potable water
plt.title('Potability count', weight='bold')
plt.tight_layout()
# total potability count plotted
non_potable = data[data['Potability'] == 0]
percent_non_potable = len(non_potable) / len(data)
print('The percentage of non potable water is: {}%'.format(round(percent_non_potable * 100, 4)))
The percentage of non potable water is: 59.6718%
data.nunique()  # unique values per column; only Potability is categorical (2 classes)
ph 2011 Hardness 2011 Solids 2011 Chloramines 2011 Sulfate 2011 Conductivity 2011 Organic_carbon 2011 Trihalomethanes 2011 Turbidity 2011 Potability 2 dtype: int64
colors = sns.color_palette('twilight')[0:6]
sns.palplot(colors)
# set up color palette for boxen plot

# Boxen Plot of each Column except Solids (Solids is on a far larger scale
# and would flatten every other box).
# Use drop(columns=...): the positional `axis` argument was removed in
# pandas 2.0, and the prior `df1 = pd.DataFrame()` assignment was dead code.
df1 = data.drop(columns='Solids')
fig1, ax = plt.subplots(figsize=[20, 10])
ax = sns.boxenplot(data=df1, orient="h", palette=colors)
sns.despine(offset=10, trim=True)
plt.title("Boxen Plot of each Column except Solids", fontsize=20);
plt.show()
# Feature matrix: every physico-chemical attribute, i.e. all columns
# except the Potability target.
feature_names = ['ph', 'Hardness', 'Solids', 'Chloramines', 'Sulfate',
                 'Conductivity', 'Organic_carbon', 'Trihalomethanes', 'Turbidity']
columns = data[feature_names]
columns.shape  # expect (n_rows, 9)
(2011, 9)
def distributions(data):
    """Plot the distribution of every column of ``data`` in a 3x3 grid.

    Each subplot shows a histogram with a KDE overlay and a rug of the raw
    observations. ``sns.distplot`` was deprecated in seaborn 0.11 and
    removed in later releases, so histplot + rugplot are used instead; the
    unused local ``features`` list was also removed.
    """
    plt.figure(figsize=(16, 16))
    for i in tqdm_notebook(range(len(data.columns)), desc='loading'):
        plt.subplot(3, 3, i + 1)
        column = data.columns[i]
        sns.histplot(data[column], color='red', kde=True)
        sns.rugplot(data[column], color='red')
        plt.title(column, weight='bold')
        plt.tight_layout()
distributions(columns)
def pairplt(data):
    """Draw a pairwise scatter/KDE grid of all columns, colored by Potability."""
    sns.pairplot(data=data, hue='Potability', palette='OrRd')
    plt.tight_layout()
pairplt(data)
# Histograms of every column (20 bins each) for a quick distribution overview.
data.hist(bins=20, color = 'green', figsize=(16,16))
plt.tight_layout()
data.columns  # list all column names
Index(['ph', 'Hardness', 'Solids', 'Chloramines', 'Sulfate', 'Conductivity',
'Organic_carbon', 'Trihalomethanes', 'Turbidity', 'Potability'],
dtype='object')
def attributes_and_potability(data):
    """Per-attribute histograms split by the Potability class.

    For each physico-chemical feature, draw an overlaid histogram of
    potable vs non-potable samples in its own figure.
    """
    # getting count of everything with respect to potability
    features = ['ph', 'Hardness', 'Solids', 'Chloramines', 'Sulfate', 'Conductivity',
                'Organic_carbon', 'Trihalomethanes', 'Turbidity']
    for var in tqdm_notebook(features, desc='loading'):
        plt.figure(figsize=(16, 10))
        # Passing the column name (with data=) is equivalent to passing the Series.
        sns.histplot(data=data, x=var, hue='Potability')  # histogram
        plt.title(var)  # title of the plot
attributes_and_potability(data)
# Correlation heatmap. Compute the matrix once (previously data.corr() was
# evaluated twice) and mask the upper triangle, which mirrors the lower one.
plt.figure(figsize=(16, 12))
corr = data.corr()
matrix = np.triu(corr)  # matrix to return k-th diagonal zeroed values
sns.heatmap(corr, annot=True, mask=matrix, cmap='OrRd')  # creating correlational map
plt.title('Correlational Map', weight='bold');
def correct_skewness(data):
    #correct data skew
    # Delegates to luciferML's Preprocess.skewcorrect, skipping the binary
    # target column. NOTE(review): presumably applies a log/power transform
    # to skewed features (the printed means shrink from ~22000 to ~9.9,
    # consistent with a log transform) — confirm against luciferML docs.
    data_set = prep.skewcorrect(data, except_columns=['Potability'])
    return data_set
data_set = correct_skewness(data) # function calling
██╗░░░░░██╗░░░██╗░█████╗░██╗███████╗███████╗██████╗░░░░░░░███╗░░░███╗██╗░░░░░
██║░░░░░██║░░░██║██╔══██╗██║██╔════╝██╔════╝██╔══██╗░░░░░░████╗░████║██║░░░░░
██║░░░░░██║░░░██║██║░░╚═╝██║█████╗░░█████╗░░██████╔╝█████╗██╔████╔██║██║░░░░░
██║░░░░░██║░░░██║██║░░██╗██║██╔══╝░░██╔══╝░░██╔══██╗╚════╝██║╚██╔╝██║██║░░░░░
███████╗╚██████╔╝╚█████╔╝██║██║░░░░░███████╗██║░░██║░░░░░░██║░╚═╝░██║███████╗
╚══════╝░╚═════╝░░╚════╝░╚═╝╚═╝░░░░░╚══════╝╚═╝░░╚═╝░░░░░░╚═╝░░░░░╚═╝╚══════╝
Started Preprocessor
Skewness in numerical features:
Skewness
Solids 0.595449
Conductivity 0.266670
ph 0.048910
Chloramines 0.012967
Organic_carbon -0.020003
Turbidity -0.033027
Sulfate -0.046523
Trihalomethanes -0.051384
Hardness -0.085174
Skewness Before Transformation for Solids: 0.5958940107371633
Mean before Transformation for Solids : 21917.441374490336, Standard Deviation before Transformation for Solids : 8640.090806098791
Skewness After Transformation for Solids: -1.2308145151482406 Mean before Transformation for Solids : 9.908982178445935, Standard Deviation before Transformation for Solids : 0.4428466687614959
Skewness Before Transformation for Conductivity: 0.26686882669457 Mean before Transformation for Conductivity : 426.5264087317783, Standard Deviation before Transformation for Conductivity : 80.69250214345881
Skewness After Transformation for Conductivity: -0.19929597288588363 Mean before Transformation for Conductivity : 6.03995949808105, Standard Deviation before Transformation for Conductivity : 0.19133953254702343
Skewness Before Transformation for ph: 0.04894678355193397 Mean before Transformation for Ph : 7.085989839285033, Standard Deviation before Transformation for Ph : 1.572945479321659
Skewness After Transformation for ph: -1.1353215889866628 Mean before Transformation for Ph : 2.069646135724062, Standard Deviation before Transformation for Ph : 0.2091935570875167
Skewness Before Transformation for Chloramines: 0.012976277458973314 Mean before Transformation for Chloramines : 7.134338414511035, Standard Deviation before Transformation for Chloramines : 1.5844257944238938
Skewness After Transformation for Chloramines: -0.8633940049210874 Mean before Transformation for Chloramines : 2.075660803668557, Standard Deviation before Transformation for Chloramines : 0.2077454472194916
Skewness Before Transformation for Organic_carbon: -0.020017660786145686 Mean before Transformation for Organic_carbon : 14.357709409067539, Standard Deviation before Transformation for Organic_carbon : 3.3241318633200096
Skewness After Transformation for Organic_carbon: -0.8735022417526354 Mean before Transformation for Organic_carbon : 2.7060706485880073, Standard Deviation before Transformation for Organic_carbon : 0.23318171477866811
Skewness Before Transformation for Turbidity: -0.03305148365834328 Mean before Transformation for Turbidity : 3.9697287992523864, Standard Deviation before Transformation for Turbidity : 0.7801521151552843
Skewness After Transformation for Turbidity: -0.5344601027379543 Mean before Transformation for Turbidity : 1.5905102577808692, Standard Deviation before Transformation for Turbidity : 0.16262524294524067
Skewness Before Transformation for Sulfate: -0.046557696988421486 Mean before Transformation for Sulfate : 333.22467188905864, Standard Deviation before Transformation for Sulfate : 41.194925817413676
Skewness After Transformation for Sulfate: -0.6904323262620179 Mean before Transformation for Sulfate : 5.803934429028258, Standard Deviation before Transformation for Sulfate : 0.1272025102134412
Skewness Before Transformation for Trihalomethanes: -0.051422085759112356 Mean before Transformation for Trihalomethanes : 66.4008593672628, Standard Deviation before Transformation for Trihalomethanes : 16.0731116931157
Skewness After Transformation for Trihalomethanes: -1.1717469055146503 Mean before Transformation for Trihalomethanes : 4.178489111806941, Standard Deviation before Transformation for Trihalomethanes : 0.265483190023941
Skewness Before Transformation for Hardness: -0.08523742258053371 Mean before Transformation for Hardness : 195.9680715571974, Standard Deviation before Transformation for Hardness : 32.62696937081507
Skewness After Transformation for Hardness: -0.8204384680215894 Mean before Transformation for Hardness : 5.268413889542584, Standard Deviation before Transformation for Hardness : 0.17487336609382453
Elapsed Time: 12.224323987960815 seconds
data_set.head()  # preview the skew-corrected features
| ph | Hardness | Solids | Chloramines | Sulfate | Conductivity | Organic_carbon | Trihalomethanes | Turbidity | Potability | |
|---|---|---|---|---|---|---|---|---|---|---|
| 3 | 2.231816 | 5.372373 | 9.999680 | 2.203795 | 5.880215 | 5.897886 | 2.967154 | 4.618498 | 1.727891 | 0 |
| 4 | 2.311765 | 5.204564 | 9.797015 | 2.021097 | 5.740229 | 5.989990 | 2.530380 | 3.496447 | 1.624341 | 0 |
| 5 | 1.884656 | 5.243403 | 10.266382 | 2.145331 | 5.792033 | 5.640018 | 2.240681 | 4.023884 | 1.269679 | 0 |
| 6 | 2.418042 | 5.517741 | 10.266418 | 2.141642 | 5.978033 | 5.651266 | 2.693931 | 4.449727 | 1.301006 | 0 |
| 7 | 2.265490 | 5.319891 | 9.523185 | 1.716139 | 5.718046 | 6.164593 | 2.592551 | 4.155727 | 1.686663 | 0 |
# Split the transformed frame into the feature matrix X and target vector y;
# Potability is the last column (see data_set.head() above).
X = data_set.drop(columns=['Potability']).values
y = data_set['Potability'].values
X.shape, y.shape
((2011, 9), (2011,))
# Standardize features to zero mean / unit variance.
# NOTE(review): the model pipelines below apply a RobustScaler on top of
# this — double scaling is harmless but redundant; confirm it is intended.
scaler = StandardScaler()
X = scaler.fit_transform(X)
X
array([[ 0.77521233, 0.5944836 , 0.20480627, ..., 1.1196562 ,
1.65738784, 0.84476912],
[ 1.15739242, -0.36511915, -0.25283611, ..., -0.7534488 ,
-2.56906051, 0.20803123],
[-0.88430311, -0.14302035, 0.80705136, ..., -1.99582197,
-0.5823542 , -1.97282826],
...,
[ 2.17675501, -4.03732299, 1.38830971, ..., 0.58861173,
-1.61068388, 0.55453041],
[-0.54418505, -0.19320948, 0.59214942, ..., -0.58294994,
-0.22884644, -0.30384534],
[-1.60039285, 0.01688199, 1.94475317, ..., -0.02180318,
0.13716609, 0.63028525]])
# 80/20 train-test split, seeded for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0) # data splitting 80-20% to avoid over fitting
# Re-shuffle the training fold. Fix random_state: the original call was
# unseeded, so training order (and any order-sensitive result) changed on
# every run.
X_train, y_train = shuffle(X_train, y_train, random_state=0) # data shuffling
y_train.shape, y_test.shape
((1608,), (403,))
# One shared preprocessing pipeline feeding each candidate estimator.
# NOTE(review): X was already standard-scaled above, so the RobustScaler is
# a second (redundant, but harmless) rescaling — confirm intent.
pipeline = make_pipeline(RobustScaler()) # creating a pipeline for all the models
Random_forest = make_pipeline(pipeline, RandomForestClassifier(random_state=0, min_samples_leaf = 2, n_estimators = 1000))
Decision_tree = make_pipeline(pipeline, DecisionTreeClassifier(random_state=0))
Logistic_regression = make_pipeline(pipeline, LogisticRegression(random_state=0))
svc = make_pipeline(pipeline, SVC(random_state=0))
KNeighbors = make_pipeline(pipeline, KNeighborsClassifier())
Ada_boost = make_pipeline(pipeline, AdaBoostClassifier(random_state=0))
# Seed XGB too, for consistency with the other estimators (it was unseeded).
xgboost = make_pipeline(pipeline, XGBClassifier(random_state=0))
gradientboost = make_pipeline(pipeline, GradientBoostingClassifier(random_state=0))
# Name -> pipeline mapping consumed by model_evaluation below.
param_dist = {
    'RandomForest': Random_forest,
    'DecisionTree': Decision_tree,
    'LogisticRegression': Logistic_regression,
    'svc': svc,
    'KNeighbors': KNeighbors,
    'AdaBoost': Ada_boost,
    'XGB': xgboost,
    'GD': gradientboost
}
def MODEL(model):
    """Fit ``model`` on the training split and print test-set metrics.

    Prints the accuracy (as a percentage), the confusion matrix and the
    full classification report. Returns None.
    """
    # run model and generate confusion matrix
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    # Bug fix: accuracy_score returns a 0-1 fraction, so scale it to a
    # percentage before printing with a '%' sign (previously this printed
    # e.g. "0.7270471464019851%").
    print('The accuracy score of the model is: {}%'.format(round(accuracy_score(y_test, y_pred) * 100, 2)))
    print('-'*50)
    print(confusion_matrix(y_test, y_pred))
    print(classification_report(y_test, y_pred))
def model_evaluation(parameter_dictionary):
    """Run MODEL (fit + printed metrics) on every named estimator.

    ``parameter_dictionary`` maps display names to pipelines. MODEL prints
    its metrics and returns None, so the returned value is always None.
    """
    # calculate precision, recall, f1 score
    for model_name, estimator in parameter_dictionary.items():
        print('-' * 50)
        print(model_name)
        evaluation = MODEL(estimator)
    return evaluation
evaluation = model_evaluation(param_dist)
--------------------------------------------------
RandomForest
The accuracy score of the model is: 0.7270471464019851%
--------------------------------------------------
[[227 25]
[ 85 66]]
precision recall f1-score support
0 0.73 0.90 0.80 252
1 0.73 0.44 0.55 151
accuracy 0.73 403
macro avg 0.73 0.67 0.68 403
weighted avg 0.73 0.73 0.71 403
--------------------------------------------------
DecisionTree
The accuracy score of the model is: 0.6153846153846154%
--------------------------------------------------
[[175 77]
[ 78 73]]
precision recall f1-score support
0 0.69 0.69 0.69 252
1 0.49 0.48 0.49 151
accuracy 0.62 403
macro avg 0.59 0.59 0.59 403
weighted avg 0.61 0.62 0.62 403
--------------------------------------------------
LogisticRegression
The accuracy score of the model is: 0.6451612903225806%
--------------------------------------------------
[[251 1]
[142 9]]
precision recall f1-score support
0 0.64 1.00 0.78 252
1 0.90 0.06 0.11 151
accuracy 0.65 403
macro avg 0.77 0.53 0.45 403
weighted avg 0.74 0.65 0.53 403
--------------------------------------------------
svc
The accuracy score of the model is: 0.7369727047146402%
--------------------------------------------------
[[234 18]
[ 88 63]]
precision recall f1-score support
0 0.73 0.93 0.82 252
1 0.78 0.42 0.54 151
accuracy 0.74 403
macro avg 0.75 0.67 0.68 403
weighted avg 0.75 0.74 0.71 403
--------------------------------------------------
KNeighbors
The accuracy score of the model is: 0.6501240694789082%
--------------------------------------------------
[[196 56]
[ 85 66]]
precision recall f1-score support
0 0.70 0.78 0.74 252
1 0.54 0.44 0.48 151
accuracy 0.65 403
macro avg 0.62 0.61 0.61 403
weighted avg 0.64 0.65 0.64 403
--------------------------------------------------
AdaBoost
The accuracy score of the model is: 0.6054590570719603%
--------------------------------------------------
[[202 50]
[109 42]]
precision recall f1-score support
0 0.65 0.80 0.72 252
1 0.46 0.28 0.35 151
accuracy 0.61 403
macro avg 0.55 0.54 0.53 403
weighted avg 0.58 0.61 0.58 403
--------------------------------------------------
XGB
The accuracy score of the model is: 0.6724565756823822%
--------------------------------------------------
[[222 30]
[102 49]]
precision recall f1-score support
0 0.69 0.88 0.77 252
1 0.62 0.32 0.43 151
accuracy 0.67 403
macro avg 0.65 0.60 0.60 403
weighted avg 0.66 0.67 0.64 403
--------------------------------------------------
GD
The accuracy score of the model is: 0.6823821339950372%
--------------------------------------------------
[[219 33]
[ 95 56]]
precision recall f1-score support
0 0.70 0.87 0.77 252
1 0.63 0.37 0.47 151
accuracy 0.68 403
macro avg 0.66 0.62 0.62 403
weighted avg 0.67 0.68 0.66 403
# Test-set accuracies (%) transcribed from the runs above for the bar chart.
# Values corrected to match the printed scores — the original hard-coded
# numbers disagreed with them (e.g. RandomForest 71.9 vs 72.7, XGB 66 vs 67.2).
accuracy_score_model = {
    'RandomForest': 72.7,
    'DecisionTree': 61.5,
    'LogisticRegression': 64.5,
    'svc': 73.7,
    'KNeighbors': 65.0,
    'AdaBoost': 60.5,
    'XGB': 67.2,
    'GD': 68.2
}
def models_overview(accuracy_score_model):
    """Horizontal bar chart comparing model accuracies; returns the Axes."""
    # compare models to each other
    model_accuracy = list(accuracy_score_model.values())
    model_name = list(accuracy_score_model.keys())
    g = sns.barplot(x = model_accuracy, y = model_name,palette='OrRd')
    plt.title('Models Overview', weight='bold');
    return g
over_view = models_overview(accuracy_score_model)
# Refit the best model (SVC) standalone on the standard-scaled data.
# NOTE(review): unlike the pipeline version this skips the RobustScaler
# step — the reported score is nearly identical, but confirm intent.
svc = SVC(random_state=0)
svc.fit(X_train, y_train)
SVC(random_state=0)
y_pred = svc.predict(X_test)  # final SVC predictions on the held-out test split
def svc_report(y_test, y_pred, X_test, svc):
    """Confusion-matrix heatmap, ROC curve and classification report for the SVC.

    ``y_pred`` must be ``svc``'s predictions for ``X_test``.
    """
    cm = confusion_matrix(y_test, y_pred)
    sns.heatmap(cm, annot=True)
    # plot_roc_curve was deprecated in scikit-learn 1.0 and removed in 1.2;
    # prefer RocCurveDisplay and fall back on older versions.
    try:
        from sklearn.metrics import RocCurveDisplay
        RocCurveDisplay.from_estimator(svc, X_test, y_test)
    except ImportError:
        plot_roc_curve(svc, X_test, y_test)
    print(classification_report(y_test, y_pred))
svc_report(y_test, y_pred, X_test, svc)
precision recall f1-score support
0 0.73 0.93 0.82 252
1 0.78 0.41 0.54 151
accuracy 0.74 403
macro avg 0.76 0.67 0.68 403
weighted avg 0.75 0.74 0.71 403
# Final headline metric for the chosen model (SVC), as a rounded percentage.
print('The accuracy score of the model is: {}% '.format(round(accuracy_score(y_test, y_pred)*100, 2)))
The accuracy score of the model is: 73.7%
# SVC is the best-performing model here (test accuracy ~73.7%), so it is kept as the final model.